import pandas as pd
import numpy as np
import statsmodels.api as sm
import plotly.express as px
import matplotlib.pyplot as plt
import yfinance as yf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.tree import DecisionTreeRegressor
from sklearn.multioutput import RegressorChain
from sklearn.ensemble import GradientBoostingRegressor
def convert_placeholder_text_to_num(text):
    '''
    Parsing helper. In a lot of investing.com data, e.g. "6.1 M" is used to
    indicate 6.1 million. This converts such a string into a float so machine
    learning can be carried out on the column.

    Suffixes handled: M (1e6), B (1e9), K (1e3), x (a multiple, suffix
    dropped), % (converted to a fraction). The placeholders '-', 'nan' and
    'NA' become NaN. A value that cannot be parsed is returned unchanged.
    '''
    result = text
    try:
        # Strip thousands separators and surrounding whitespace once, up
        # front (the original only removed commas in the '%' branch, so
        # values like '1,500 K' failed to parse).
        cleaned = text.replace(',', '').strip()
        # The original compared text[-1] (a single character) against 'nan'
        # and 'NA', so those placeholders were never caught.
        if cleaned in ('-', 'nan', 'NA'):
            result = np.nan
        elif cleaned.endswith('M'):
            result = float(cleaned[:-1]) * 10**6
        elif cleaned.endswith('B'):
            result = float(cleaned[:-1]) * 10**9
        elif cleaned.endswith('K'):
            result = float(cleaned[:-1]) * 10**3
        elif cleaned.endswith('-'):
            result = np.nan
        elif cleaned.endswith('x'):
            result = float(cleaned[:-1])
        elif cleaned.endswith('%'):
            result = float(cleaned[:-1]) * 0.01
    except (TypeError, ValueError, IndexError, AttributeError) as e:
        # Best-effort parsing: report and fall through to the original value.
        print(e)
    return result
def get_historical_yfinance(ticker):
    '''
    From a ticker in the dataframes above, download 5Y of historical data.

    ticker: integer (or numeric) HK stock code, e.g. 2322 -> '2322.HK'.
    Returns the yfinance history DataFrame, or None if the download failed.
    '''
    hist = None  # ensure a defined return value even when the download fails
    try:
        symbol = f'{ticker:04}' + '.HK'  # convert ticker to yfinance form
        asset = yf.Ticker(symbol)
        # yfinance expects a lowercase period from its valid set; "3Y" was
        # invalid, and the docstring promises 5 years anyway.
        hist = asset.history(period="5y")
    except Exception as e:
        # The original `except error` referenced an undefined name, so any
        # download failure raised NameError instead of being reported.
        print(e)
        print(str(ticker))
        print('\n')
    return hist
# NOTE: an earlier per-ticker loop using get_historical_yfinance lived here;
# bulk price data is instead loaded from a pickle further below.

# import the financial-ratio datasets (index = ticker, dates parsed)
_CSV_KWARGS = dict(index_col=0, parse_dates=True,
                   infer_datetime_format=True, thousands=',')
EV = pd.read_csv('data/ratios/EV.csv', **_CSV_KWARGS)
fcf = pd.read_csv('data/ratios/fcf.csv', **_CSV_KWARGS)
ebitda = pd.read_csv('data/ratios/ebitda.csv', **_CSV_KWARGS)
revenue = pd.read_csv('data/ratios/revenue.csv', **_CSV_KWARGS)
roe = pd.read_csv('data/ratios/roe.csv', **_CSV_KWARGS)
gross_profit_margin = pd.read_csv('data/ratios/gross-profit-margin.csv', **_CSV_KWARGS)
quick_ratio = pd.read_csv('data/ratios/quick-ratio.csv', **_CSV_KWARGS)
debt_to_equity = pd.read_csv('data/ratios/debt-to-equity.csv', **_CSV_KWARGS)

# de-duplicate, restore the ticker index as a column, and cast everything to
# string so the frames can be merged on the shared descriptive columns
dataframes = [fcf, ebitda, revenue, roe, gross_profit_margin, quick_ratio, debt_to_equity]
EV = EV.drop_duplicates().reset_index().astype(str)
for idx, frame in enumerate(dataframes):
    dataframes[idx] = frame.drop_duplicates().reset_index().astype(str)
fcf, ebitda, revenue, roe, gross_profit_margin, quick_ratio, debt_to_equity = dataframes

# merge everything into one wide frame keyed on the descriptive columns
_MERGE_KEYS = ['Ticker', 'Name', 'Sector', 'Industry', 'Industry Group',
               'Market Cap', 'Last Reported Fiscal Year']
test_merge = EV
for frame in dataframes:
    test_merge = pd.merge(test_merge, frame, how='outer',
                          left_on=_MERGE_KEYS, right_on=_MERGE_KEYS)
#test_merge = pd.merge(EV.reset_index(), fcf.reset_index(), how = 'inner',
#left_on=['Ticker','Name','Sector','Industry','Industry Group','Market Cap','Last Reported Fiscal Year'],
#right_on=['Ticker','Name','Sector','Industry','Industry Group','Market Cap','Last Reported Fiscal Year'])
# dataset for ML all sectors
# test_merge.dropna().to_csv('test.csv')
# See comments in next cell
# ticker_string = ''
#
# for elem in test_merge.Ticker:
# ticker_string = ticker_string + f'{elem:04}'+'.HK '
#
# ticker_string = ticker_string[:-1] # drop the last white space character
# original code to download all historical price data.
# DON'T RUN! Very slow, import the csv instead (next cell).
#test_assets = yf.download(tickers = ticker_string, # list of tickers
# period = "5y", # time period
# interval = "1mo", # trading interval
# ignore_tz = True, # ignore timezone when aligning data from different exchanges?
# prepost = False) # download pre/post market hours data?
#test_assets.to_pickle('data/price_data_complete.pickle')
# Load the pre-downloaded monthly price data (produced by the commented-out
# yf.download call above) rather than re-downloading it.
test_assets = pd.read_pickle('data/price_data_complete.pickle')
#test_assets = test_assets.set_index('Date')
test_assets.head()
| Adj Close | ... | Volume | |||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1000.HK | 1003.HK | 1005.HK | 1010.HK | 1020.HK | 1023.HK | 1029.HK | 1030.HK | 1033.HK | 1034.HK | ... | 9906.HK | 9908.HK | 9910.HK | 9913.HK | 9920.HK | 9930.HK | 9936.HK | 9950.HK | 9980.HK | 9986.HK | |
| Date | |||||||||||||||||||||
| 2018-04-01 | 2.76 | 2.00 | 2.509840 | 2.06 | 0.270 | 1.224793 | 0.168 | 5.405748 | 1.16 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2018-05-01 | 3.25 | 1.97 | 2.641937 | 2.18 | 0.270 | 1.210384 | 0.148 | 5.060175 | 1.27 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2018-06-01 | 3.00 | 2.00 | 2.641937 | 2.25 | 0.260 | 1.210384 | 0.134 | 6.030394 | 1.05 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2018-07-01 | 2.40 | 2.09 | 2.457001 | 2.22 | 0.249 | 1.628255 | 0.137 | 5.987926 | 1.07 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2018-08-01 | 2.50 | 1.87 | 2.457001 | 2.12 | 0.218 | 1.621050 | 0.113 | 5.155562 | 1.01 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 5202 columns
# keep only tickers with a complete adjusted-close history
asset_prices = test_assets['Adj Close'].dropna(axis=1)
# parse placeholder strings ('6.1 M', '-', '12%', ...) into floats; drop rows
# containing NaN both before and after the conversion
ML_data = test_merge.dropna().applymap(convert_placeholder_text_to_num).dropna()
# NOTE(review): the next line is stray notebook output pasted into the source
could not convert string to float: '' could not convert string to float: '' could not convert string to float: '' could not convert string to float: '' could not convert string to float: '' could not convert string to float: ''
ML_data.head()
| Ticker | Name | Sector | Industry | Industry Group | Market Cap | Last Reported Fiscal Year | EV | EV FQ-1 | EV FQ-2 | ... | Debt / Equity FQ-3 | Debt / Equity FQ-4 | Debt / Equity FQ-5 | Debt / Equity FQ-6 | Debt / Equity FQ-7 | Debt / Equity FQ-8 | Debt / Equity FQ-9 | Debt / Equity FQ-10 | Debt / Equity FQ-11 | Debt / Equity FQ-12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 199 | ITC Properties Group Limited | Real Estate | Real Estate Management & Development | Real Estate | 1.221000e+08 | 2023 | 4.833800e+07 | 75623000.0 | 101000000.0 | ... | 0.515 | 0.899 | 0.899 | 0.773 | 0.773 | 0.836 | 0.836 | 0.839 | 0.839 | 0.847 |
| 1 | 2322 | Hong Kong ChaoShang Group Limited | Industrials | Trading Companies & Distributors | Capital Goods | 3.832000e+08 | 2023 | 3.652000e+08 | 438800000.0 | 409700000.0 | ... | 0.015 | 0.007 | 0.007 | 0.021 | 0.021 | 0.028 | 0.028 | 0.037 | 0.037 | 0.044 |
| 5 | 179 | Johnson Electric Holdings Limited | Consumer Discretionary | Auto Components | Automobiles & Components | 1.056000e+09 | 2023 | 1.265000e+09 | 1435000000.0 | 1384000000.0 | ... | 0.256 | 0.28 | 0.28 | 0.213 | 0.213 | 0.247 | 0.247 | 0.26 | 0.26 | 0.224 |
| 9 | 989 | Hua Yin International Holdings Limited | Real Estate | Real Estate Management & Development | Real Estate | 3.854000e+08 | 2023 | 5.113000e+08 | 592700000.0 | 537900000.0 | ... | -4.184 | -4.0 | -4.0 | -3.746 | -3.746 | -1.337 | -1.337 | -1.472 | -1.472 | 1.822 |
| 10 | 105 | Associated International Hotels Limited | Real Estate | Real Estate Management & Development | Real Estate | 4.297000e+08 | 2023 | 3.898000e+08 | 496900000.0 | 536100000.0 | ... | 0.021 | 0.02 | 0.02 | 0.019 | 0.019 | 0.017 | 0.017 | 0.016 | 0.016 | 0.015 |
5 rows × 111 columns
# map ratio-dataset tickers onto yfinance symbols, then intersect them with
# the tickers for which price history is actually available
ratio_ticker_list = list(ML_data.Ticker)
ratio_ticker_list_new = [ticker + '.HK' for ticker in ratio_ticker_list]
asset_ticker_list = list(asset_prices.columns)
combined_tickers = [ticker for ticker in asset_ticker_list if ticker in ratio_ticker_list_new]
combined_tickers
combined_ticker_int = [ticker[:-3] for ticker in combined_tickers]  # strip the '.HK' suffix
ML_final = ML_data[ML_data['Ticker'].isin(combined_ticker_int)]
# gives 61 assets but I haven't dropped NaNs in the financial ratios dataset. Final filtered dataset is expected to contain fewer assets. Edit: 7.
asset_prices[combined_tickers].head()
| 1023.HK | 1140.HK | 1243.HK | 1373.HK | 1468.HK | 1499.HK | 2322.HK | |
|---|---|---|---|---|---|---|---|
| Date | |||||||
| 2018-04-01 | 1.224793 | 2.984653 | 0.852034 | 1.133839 | 0.239 | 0.60 | 0.60 |
| 2018-05-01 | 1.210384 | 3.051189 | 0.790193 | 1.233674 | 0.237 | 0.91 | 0.70 |
| 2018-06-01 | 1.210384 | 2.956137 | 0.755837 | 1.219412 | 0.227 | 0.85 | 0.67 |
| 2018-07-01 | 1.628255 | 2.861084 | 0.755837 | 1.390557 | 0.246 | 0.72 | 0.60 |
| 2018-08-01 | 1.621050 | 2.823063 | 0.748966 | 1.411951 | 0.355 | 0.67 | 0.58 |
# select descriptive + financial-ratio columns for the PCA / LDA analysis
PCA_LDA_df = test_merge[['Ticker', 'Sector', 'Industry', 'Market Cap', 'EV',
                         'FCF', 'EBITDA', 'Revenue', 'ROE', 'Gross-Profit-Margin',
                         'Quick-Ratio', 'Debt / Equity']]
# remove NaNs, replace placeholder text with floats
PCA_LDA_df = PCA_LDA_df.dropna().applymap(convert_placeholder_text_to_num).dropna()
# set up datasets for LDA: X = financial ratios, y = sector labels (strings)
X = PCA_LDA_df[['Market Cap', 'EV', 'FCF', 'EBITDA', 'Revenue', 'ROE', 'Gross-Profit-Margin',
                'Quick-Ratio', 'Debt / Equity']].values
y = PCA_LDA_df[['Sector']].values
y = y.flatten()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# NOTE(review): LDA requires n_components <= min(n_features, n_classes - 1);
# 7 assumes at least 8 distinct sectors survive the NaN filtering — confirm.
lda = LDA(n_components=7)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)
plt.plot(lda.explained_variance_ratio_)
# 2-component LDA scatter coloured by sector. The original code used the
# name LinearDiscriminantAnalysis (only imported under the alias LDA, so it
# raised NameError), iterated over an undefined target_names list, and
# compared the string labels in y against the integers 0/1/2.
lda = LDA(n_components=2)
X_r2 = lda.fit(X, y).transform(X)
colors = ["navy", "turquoise", "darkorange"]
lw = 2
for color, target_name in zip(colors, lda.classes_):
    plt.scatter(
        X_r2[y == target_name, 0], X_r2[y == target_name, 1],
        alpha=0.8, color=color, label=target_name
    )
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.title("LDA of sector dataset")
X;
# taken from https://stackoverflow.com/questions/39216897/plot-pca-loadings-and-loading-in-biplot-in-sklearn-like-rs-autoplot
def myplot(score, coeff, labels=None):
    '''
    Draw a PCA biplot: scatter the rescaled scores of the first two principal
    components and overlay the loading vectors given in coeff. When labels is
    None, loadings are captioned Var1, Var2, ...; otherwise labels[i] is used.
    '''
    pc1 = score[:, 0]
    pc2 = score[:, 1]
    # rescale scores so they fit in the same unit box as the loadings
    plt.scatter(pc1 * (1.0 / (pc1.max() - pc1.min())),
                pc2 * (1.0 / (pc2.max() - pc2.min())))
    for idx in range(coeff.shape[0]):
        dx, dy = coeff[idx, 0], coeff[idx, 1]
        plt.arrow(0, 0, dx, dy, color='r', alpha=0.5)
        caption = "Var" + str(idx + 1) if labels is None else labels[idx]
        plt.text(dx * 1.15, dy * 1.15, caption, color='g', ha='center', va='center')
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))
    plt.grid()
# standardise the ratio matrix and project it onto its principal components
scaler = StandardScaler()
scaler.fit(X)
X_pca = scaler.transform(X)
pca = PCA()
X_pca = pca.fit_transform(X_pca)
myplot(X_pca[:, 0:2], np.transpose(pca.components_[0:2, :]))
plt.show()
# Using plotly
# X_pca already holds the PCA scores; the original called
# pca.fit_transform(X_pca) again here, refitting the model on its own output.
components = X_pca
labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}
fig = px.scatter_matrix(
    components,
    labels=labels,
    dimensions=range(5),
    color=y.flatten()
)
fig.update_traces(diagonal_visible=False)
fig.update_layout(
    autosize=False,
    width=1000,
    height=800,)
fig.show()
# Percentage of variance explained for each components
print("explained variance ratio: %s" % str(pca.explained_variance_ratio_))
# scree plot of the explained variance per component
plt.plot(pca.explained_variance_ratio_)
plt.xlabel('n_component');
plt.ylabel('Explained variance ratio');
plt.savefig('pca_scree.png',dpi=600)
Resample to quarterly intervals.
# test code
asset_prices = test_assets['Adj Close'].dropna(axis=1)
# resample the monthly series to quarterly intervals for the 7 assets that
# survived the ratio/price intersection
_selected = ['2322.HK', '1023.HK', '1468.HK', '1243.HK', '1373.HK', '1140.HK', '1499.HK']
asset_prices = asset_prices[_selected].resample('Q').last()
# drop the last (incomplete) quarter
asset_prices = asset_prices.iloc[:-1]
# test code cont.
# reverse the order of the df, calculate % change i.e. returns, then select
# the datapoints corresponding to financial ratios
returns_rhs = (
    asset_prices[::-1]
    .pct_change(-1)
    .iloc[:13]
    .rename(columns={ticker: ticker + ' ret' for ticker in _selected})
)
def get_train_test_from_fratio(fratio):
    '''
    Given a financial ratio (list below), return a dataframe [X1,X2,...,y1,y2,...] where Xi are the financial ratios and yi are the % returns for the asset.
    Vertical axis is time (in decreasing order)
    Financial ratios:
    - EV
    - FCF
    - EBITDA
    - Revenue
    - ROE
    - Gross-Profit-Margin
    - Quick-Ratio
    - Debt / Equity

    Relies on the module-level ML_final and (quarterly-resampled) asset_prices.
    NOTE(review): the integer keys below are hard-coded row positions of
    ML_final for these 7 tickers — they break if the filtering above changes.
    '''
    # select this ratio's FQ columns and relabel rows with ticker symbols
    EV_lhs = ML_final.loc[:, ML_final.columns.str.contains(fratio)].transpose().reset_index().rename(
        columns={1: '2322.HK', 13: '1023.HK',
                 18: '1468.HK', 20: '1243.HK',
                 25: '1373.HK', 57: '1140.HK',
                 96: '1499.HK'})
    # reverse the order of the df, calculate % change i.e. returns, then select datapoints corresponding to financial ratios.
    returns_rhs = asset_prices[::-1].pct_change(-1).iloc[:13].rename(
        columns={'2322.HK': '2322.HK ret', '1023.HK': '1023.HK ret',
                 '1468.HK': '1468.HK ret', '1243.HK': '1243.HK ret',
                 '1373.HK': '1373.HK ret', '1140.HK': '1140.HK ret',
                 '1499.HK': '1499.HK ret'})
    # The original selection listed '2322.HK' twice, duplicating that
    # ticker's ratio column in the feature half of the frame.
    df = pd.concat([EV_lhs, returns_rhs.reset_index()], axis=1)[
        ['Date', '2322.HK', '1023.HK', '1468.HK', '1243.HK', '1373.HK',
         '1140.HK', '1499.HK', '2322.HK ret', '1023.HK ret', '1468.HK ret',
         '1243.HK ret', '1373.HK ret', '1140.HK ret', '1499.HK ret']].set_index('Date')
    return df
Define the ML data: each frame below corresponds to one financial ratio.
# build one ML frame per financial ratio, then persist each to CSV
X_EV = get_train_test_from_fratio('EV')
X_FCF = get_train_test_from_fratio('FCF')
X_EBIT = get_train_test_from_fratio('EBITDA')
X_REV = get_train_test_from_fratio('Revenue')
X_ROE = get_train_test_from_fratio('ROE')
X_GPM = get_train_test_from_fratio('Gross-Profit-Margin')
X_QUICK = get_train_test_from_fratio('Quick-Ratio')
X_DE = get_train_test_from_fratio('Debt / Equity')
for _name, _frame in [('X_EV', X_EV), ('X_FCF', X_FCF), ('X_EBIT', X_EBIT),
                      ('X_REV', X_REV), ('X_ROE', X_ROE), ('X_GPM', X_GPM),
                      ('X_QUICK', X_QUICK), ('X_DE', X_DE)]:
    _frame.to_csv('data/' + _name + '_MLdata.csv')
# taken from https://towardsdatascience.com/the-complete-guide-to-time-series-forecasting-using-sklearn-pandas-and-numpy-7694c90e45c1
def ts_test_train_split(df):
    '''
    For a dataframe [X1,X2,...,y1,y2,...] produced by get_train_test_from_fratio,
    return (X_train, y_train, X_test, y_test) as numpy arrays. Feature columns
    end with 'HK', target columns with 'ret'; the last two rows are held out
    as the test set.
    '''
    feature_cols = [col for col in df.columns if col.endswith('HK')]
    target_cols = [col for col in df.columns if col.endswith('ret')]
    features = df[feature_cols].values
    targets = df[target_cols].values
    return features[:-2], targets[:-2], features[-2:], targets[-2:]
# sanity check: split the EV frame and inspect the held-out test returns
X_train, y_train, X_test, y_test = ts_test_train_split(X_EV)
y_test
array([[ 0.65999997, -0.55288456, -0.20779217, 0.1025641 , -0.03719925,
-0.25999999, -0.17543859],
[ 0.19047623, -0.1476528 , 0.56504057, -0.18771023, 0.01966014,
-0.04458602, 0.4690721 ]])
def ML_decision_tree_boosting(X_train, y_train, X_test, y_test):
    '''
    Run a decision tree regressor and gradient boosting regressor and return coefficient of determination on both training / test sets.
    Gradient boosting regressor is chained so the model will try to predict sequential returns from past inputs.
    See: https://towardsdatascience.com/the-complete-guide-to-time-series-forecasting-using-sklearn-pandas-and-numpy-7694c90e45c1

    Returns (dt_train_score, dt_test_score, cgbr_train_score, cgbr_test_score),
    each being an R^2 score as produced by sklearn's .score().
    '''
    # multi-output decision tree baseline
    dt_seq = DecisionTreeRegressor(random_state=42)
    dt_seq.fit(X_train, y_train)
    # gradient boosting handles a single output, so chain one regressor per
    # target column (RegressorChain feeds earlier predictions forward)
    gbr_seq = GradientBoostingRegressor(random_state=42)
    chained_gbr = RegressorChain(gbr_seq)
    chained_gbr.fit(X_train, y_train)
    # (the original also computed .predict() on the test set into dt_seq_preds
    # and gbr_seq_preds but never used them; those dead stores are removed)
    dt_train_score = dt_seq.score(X_train, y_train)
    dt_test_score = dt_seq.score(X_test, y_test)
    cgbr_train_score = chained_gbr.score(X_train, y_train)
    cgbr_test_score = chained_gbr.score(X_test, y_test)
    return dt_train_score, dt_test_score, cgbr_train_score, cgbr_test_score
# apply ML function to all financial ratio dataframes
financial_ratios_dict = {'X_EV': X_EV,
                         'X_FCF': X_FCF,
                         'X_EBIT': X_EBIT,
                         'X_REV': X_REV,
                         'X_ROE': X_ROE,
                         'X_GPM': X_GPM,
                         'X_QUICK': X_QUICK,
                         'X_DE': X_DE}
for label, frame in financial_ratios_dict.items():
    X_train, y_train, X_test, y_test = ts_test_train_split(frame)
    dt_train_score, dt_test_score, cgbr_train_score, cgbr_test_score = ML_decision_tree_boosting(X_train, y_train, X_test, y_test)
    print(str(label) + ' dt train score: ' + str(dt_train_score) + ', dt test score: ' + str(dt_test_score) +
          ', cgbr train score: ' + str(cgbr_train_score) + ', cgbr test score: ' + str(cgbr_test_score))
X_EV dt train score: 1.0, dt test score: -2.581986121565467, cgbr train score: 0.9999999956582766, cgbr test score: -5.163514426569127 X_FCF dt train score: 1.0, dt test score: -5.058310243181415, cgbr train score: 0.9999999947948892, cgbr test score: -7.744285351237203 X_EBIT dt train score: 1.0, dt test score: -8.477140958532724, cgbr train score: 0.9999998966063013, cgbr test score: -9.077284584457173 X_REV dt train score: 1.0, dt test score: -6.648629112063416, cgbr train score: 0.999999993985728, cgbr test score: -5.569041106992259 X_ROE dt train score: 1.0, dt test score: -6.648629112063416, cgbr train score: 0.9999999968451213, cgbr test score: -4.433750342075148 X_GPM dt train score: 0.6504882948750798, dt test score: -8.31246456204073, cgbr train score: 0.42253245611372925, cgbr test score: -4.963616181777584 X_QUICK dt train score: 0.6112198297949563, dt test score: -6.648629112063416, cgbr train score: 0.2922739158006787, cgbr test score: -5.162086510747484 X_DE dt train score: 0.6112198297949563, dt test score: -6.648629112063416, cgbr train score: 0.29300018255057064, cgbr test score: -9.32353157453062